In [1]:
import pandas as pd
import numpy as np
# Bind `plt` to pyplot, which is what the alias conventionally means. The previous
# `import matplotlib as plt` bound the top-level package, which has no plot/subplots/show
# functions, so any later `plt.<plotting call>` would raise AttributeError.
import matplotlib.pyplot as plt
import os

# Show the working directory so it is clear where relative data paths resolve from.
os.getcwd()
Out[1]:
'/Users/piyushmishra/Downloads/Problems/Energy'
In [2]:
# Load the training data from the current working directory rather than from a
# user-specific absolute path, so the notebook runs on any machine that has
# train.csv next to it (the working directory is displayed by the first cell).
train_csv_path = os.path.join(os.getcwd(), "train.csv")
train = pd.read_csv(train_csv_path)
In [3]:
# Preview the first 10 rows of the loaded training frame.
train.head(10)
Out[3]:
Observation T1 RH_1 T2 RH_2 T3 RH_3 T4 RH_4 T5 ... RH_8 T9 RH_9 T_out Press_mm_hg RH_out Windspeed Visibility Tdewpoint Energy
0 1111 22.700000 37.200000 21.000000 38.000000 23.390000 37.290000 22.832857 34.942857 20.500000 ... 45.360000 20.200000 38.663333 12.80 760.050000 62.000000 3.500000 28.0 5.65 70
1 1112 21.500000 41.045000 20.500000 39.133333 22.926667 39.526667 21.700000 34.126667 18.633333 ... 34.663333 19.730000 37.933333 8.47 764.166667 48.166667 8.000000 26.5 -1.92 210
2 1113 21.666667 38.000000 22.600000 35.700000 21.890000 36.590000 22.000000 35.530000 19.000000 ... 38.545000 19.790000 39.430000 10.60 757.600000 57.000000 2.000000 27.0 2.40 50
3 1114 23.290000 38.530000 21.671429 38.000000 24.290000 36.200000 23.100000 35.090000 22.042857 ... 38.600000 22.600000 36.194286 11.60 760.600000 63.000000 1.000000 40.0 4.67 50
4 1115 24.000000 42.560000 26.830000 34.356000 26.530000 40.333333 24.500000 40.900000 21.500000 ... 44.230000 22.600000 44.090000 17.70 756.983333 63.000000 1.000000 21.5 10.40 250
5 1116 18.390000 43.363333 17.700000 42.433333 19.290000 42.500000 18.600000 41.663333 17.200000 ... 47.600000 16.700000 46.000000 1.70 749.400000 97.000000 3.833333 32.5 1.25 180
6 1117 24.500000 36.060000 24.434000 34.634000 24.600000 33.590000 24.890000 34.700000 24.465714 ... 37.626667 23.405714 34.880000 9.43 756.300000 61.666667 5.000000 40.0 2.40 100
7 1118 23.000000 34.863333 21.000000 35.223333 23.000000 36.090000 22.633333 33.933333 21.390000 ... 39.363333 20.318571 37.200000 8.00 765.950000 68.166667 1.000000 40.0 2.33 60
8 1119 23.100000 35.000000 23.390000 31.856667 23.000000 35.290000 23.000000 33.200000 20.856667 ... 37.863333 20.290000 36.700000 18.60 760.600000 38.000000 4.500000 40.0 4.05 50
9 1120 21.000000 37.200000 18.500000 40.656667 20.390000 37.260000 21.600000 34.463333 20.093333 ... 38.123333 17.290000 35.590000 4.67 748.833333 72.666667 3.666667 40.0 0.20 100

10 rows × 26 columns

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
train.describe()
Out[4]:
Observation T1 RH_1 T2 RH_2 T3 RH_3 T4 RH_4 T5 ... RH_8 T9 RH_9 T_out Press_mm_hg RH_out Windspeed Visibility Tdewpoint Energy
count 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 ... 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000 15780.000000
mean 9000.500000 21.683008 40.263077 20.339386 40.426604 22.265359 39.242550 20.853829 39.037623 19.591562 ... 42.934090 19.489357 41.559527 7.414789 755.514104 79.756982 4.041824 38.378577 3.765539 97.294043
std 4555.437959 1.613157 3.983819 2.199896 4.085470 2.013391 3.247703 2.046331 4.337781 1.849086 ... 5.223342 2.020488 4.135306 5.321081 7.385701 14.877160 2.447902 11.810273 4.202025 100.932234
min 1111.000000 16.790000 27.023333 16.100000 20.463333 17.200000 28.766667 15.100000 27.660000 15.330000 ... 29.600000 14.890000 29.166667 -5.000000 729.300000 24.000000 0.000000 1.000000 -6.600000 10.000000
25% 5055.750000 20.730000 37.360000 18.790000 37.890000 20.790000 36.900000 19.500000 35.530000 18.270000 ... 39.066667 18.000000 38.530000 3.680000 750.933333 70.166667 2.000000 29.000000 0.900000 50.000000
50% 9000.500000 21.600000 39.663333 20.000000 40.500000 22.100000 38.560000 20.666667 38.433333 19.390000 ... 42.376214 19.390000 40.900000 6.930000 756.100000 83.666667 3.666667 40.000000 3.470000 60.000000
75% 12945.250000 22.600000 43.090000 21.530000 43.290000 23.290000 41.760000 22.100000 42.193333 20.633333 ... 46.560000 20.600000 44.363333 10.400000 760.900000 91.666667 5.500000 40.000000 6.550000 100.000000
max 16890.000000 26.260000 63.360000 29.856667 56.026667 29.236000 50.163333 26.200000 51.090000 25.745000 ... 58.780000 24.500000 53.326667 26.100000 772.300000 100.000000 14.000000 66.000000 15.500000 850.000000

8 rows × 26 columns

In [5]:
# (number of rows, number of columns) of the training frame.
train.shape
Out[5]:
(15780, 26)
In [6]:
 # Count missing values per column with pandas' vectorised reduction; this gives the
 # same per-column counts as summing each column's null mask in a Python-level lambda.
 train.isnull().sum()
Out[6]:
Observation    0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
Energy         0
dtype: int64
In [7]:
import sklearn
import shap
from sklearn.model_selection import train_test_split

# Target is the Energy column; every other column of `train` is kept as a feature.
# NOTE(review): that includes `Observation`, which looks like a row identifier rather
# than a measurement — confirm it is meant to be a model input.
y = train["Energy"]
x = train.drop("Energy", axis="columns")
seed = 1

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)
In [8]:
# create training and testing vars
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(12624, 25) (12624,)
(3156, 25) (3156,)
In [9]:
# fit a model
from sklearn.ensemble import RandomForestRegressor
rf= RandomForestRegressor()
model = rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
In [10]:
# Peek at the first five predictions for the held-out rows.
predictions[:5]
Out[10]:
array([ 232.,   47.,   99.,   51.,  152.])
In [11]:
# Score the fitted forest on the held-out split; `model` is the same estimator object
# as `rf` (it is the return value of rf.fit).
print(model.score(X_test, y_test))
0.463924631266
In [12]:
# print the JS visualization code to the notebook
shap.initjs()
In [13]:
# Compute SHAP values for every held-out row with a tree explainer built on the fitted forest.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Force plot explaining the first held-out prediction (row 0 of the SHAP matrix and of X_test).
# NOTE(review): newer shap releases appear to expect the explainer's expected value as the
# first force_plot argument — confirm against the installed shap version before upgrading.
shap.force_plot(shap_values[0], X_test.iloc[0])
Out[13]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security.

RH_1 (humidity in the kitchen area) has the larger impact on the output variable, Energy (energy used), while RH_7 (humidity in the ironing room) has less impact on the output.

In [14]:
# visualize the training set predictions
shap.force_plot(shap_values, X_test)
/Users/piyushmishra/anaconda3/lib/python3.6/site-packages/shap/plots.py:693: UserWarning: shap.force_plot is slow many thousands of rows, try subsampling your data.
  warnings.warn("shap.force_plot is slow many thousands of rows, try subsampling your data.")
Out[14]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security.
In [15]:
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot("RH_7", shap_values, X_test)
In [16]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_test)